## packages
library(tidyverse)    # data wrangling + ggplot2
library(tidytext)     # unnest_tokens() for job-title text mining
library(sf)           # spatial vector data (st_* functions)
library(maps)
library(albersusa)    # usa_sf() / counties_sf() geometries
library(rgeocodio)    # gio_batch_geocode() for city coordinates
library(geojsonio)    # geojson_read() for the hex-grid map
library(rgeos)        # gCentroid() for hex-tile label positions
library(ggwordcloud)
library(patchwork)
library(pdftools)
library(showtext)     # font_add_google() web fonts
## save plots? toggles the if(save) blocks after each plot below
save <- TRUE
#save <- FALSE
## quality of png's (dots per inch for raster output)
dpi <- 750
## font: register Google fonts used throughout the plots
font_add_google("Montserrat", "Montserrat")
font_add_google("Overpass", "Overpass")
font_add_google("Overpass Mono", "Overpass Mono")
## theme updates: theme_clean() as the base for all non-map plots
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))
## global theme tweaks layered on top of theme_clean():
## centered bold titles, light grey grids, legend on top
theme_update(plot.margin = margin(30, 30, 30, 30),
             plot.background = element_rect(color = "white",
                                            fill = "white"),
             plot.title = element_text(size = 16,
                                       face = "bold",
                                       lineheight = 1.15,
                                       hjust = .5,
                                       margin = margin(10, 0, 25, 0)),
             #plot.title.position = "plot",
             plot.caption = element_text(color = "grey40",
                                         size = 9,
                                         margin = margin(20, 0, -20, 0)),
             plot.caption.position = "plot",
             axis.line.x = element_line(color = "black",
                                        size = .8),
             axis.line.y = element_line(color = "black",
                                        size = .8),
             axis.title.x = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(t = 20)),
             axis.title.y = element_text(size = 16,
                                         face = "bold",
                                         margin = margin(r = 20)),
             axis.text = element_text(size = 11,
                                      color = "black",
                                      face = "bold"),
             axis.text.x = element_text(margin = margin(t = 10)),
             axis.text.y = element_text(margin = margin(r = 10)),
             axis.ticks = element_blank(),
             ## subtle light-grey grid lines; minor y grid removed
             panel.grid.major.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.major.y = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.x = element_line(size = .6,
                                               color = "#eaeaea",
                                               linetype = "solid"),
             panel.grid.minor.y = element_blank(),
             panel.spacing.x = unit(4, "lines"),
             panel.spacing.y = unit(2, "lines"),
             legend.position = "top",
             legend.title = element_text(family = "Montserrat",
                                         color = "black",
                                         size = 14,
                                         margin = margin(5, 0, 5, 0)),
             legend.text = element_text(family = "Montserrat",
                                        color = "black",
                                        size = 11,
                                        margin = margin(4.5, 4.5, 4.5, 4.5)),
             legend.background = element_rect(fill = NA,
                                              color = NA),
             legend.key = element_rect(color = NA, fill = NA),
             #legend.key.width = unit(5, "lines"),
             #legend.spacing.x = unit(.05, "pt"),
             #legend.spacing.y = unit(.55, "pt"),
             #legend.margin = margin(0, 0, 10, 0),
             strip.text = element_text(face = "bold",
                                       margin = margin(b = 10)))
## theme settings for flipped plots: after coord_flip() the visual y grid
## corresponds to panel.grid.*.x, so swap which minor grid is drawn
theme_flip <-
  theme(panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_line(size = .6,
                                          color = "#eaeaea"))
## theme settings for maps: theme_void base plus horizontal legends and
## tight title/caption margins (added to individual map plots below)
theme_map <-
  theme_void(base_family = "Montserrat") +
  theme(legend.direction = "horizontal",
        legend.box = "horizontal",
        legend.margin = margin(10, 10, 10, 10),
        legend.title = element_text(size = 17,
                                    face = "bold"),
        legend.text = element_text(color = "grey33",
                                   size = 12),
        plot.margin = margin(15, 5, 15, 5),
        plot.title = element_text(face = "bold",
                                  size = 20,
                                  hjust = .5,
                                  margin = margin(30, 0, 10, 0)),
        plot.subtitle = element_text(face = "bold",
                                     color = "grey33",
                                     size = 17,
                                     hjust = .5,
                                     margin = margin(10, 0, -30, 0)),
        plot.caption = element_text(size = 14,
                                    color = "grey33",
                                    hjust = .97,
                                    margin = margin(-30, 0, 0, 0)))
## numeric format for labels (thousands separators, no scientific notation)
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = FALSE)
## main color backlinko
bl_col <- "#00d188"
## colors + labels for interval stripes
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")
## colors for degrees (Bachelors, Masters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", colorspace::darken(bl_col, .1))
## gradient colors for position
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)

## GlassDoor job offers (US only); normalize column names, drop the URL
df_gd <- readr::read_csv(here::here("raw_data", "Glassdoor - 2020-04-01-0.csv")) %>%
  janitor::clean_names() %>%
  dplyr::select(-url)

## LinkedIn job offers (worldwide); keep only titles mentioning SEO.
## NOTE(review): the pattern requires spaces around "seo"/"SEO", so titles
## that start or end with the term are excluded — confirm this is intended
df_li <- readr::read_csv(here::here("raw_data", "Linkedin.csv")) %>%
  janitor::clean_names() %>%
  filter(str_detect(job_title, " seo | SEO")) %>%
  mutate(size = as.character(size))

## prepare GlassDoor data for joining
## derive employment_type and seniority from free-text fields so the
## GlassDoor columns line up with the LinkedIn data for joining
df_gd_join <-
  df_gd %>%
  mutate(
    ## patterns omit the first letter ("ulltime") to catch upper- and
    ## lower-case variants without a case-insensitive regex
    employment_type = case_when(
      str_detect(description, "ulltime|ull-time|ull-Time") ~ "Full-time",
      str_detect(description, "arttime|art-time|art-Time") ~ "Part-time",
      TRUE ~ NA_character_
    ),
    ## first matching rule wins, so earlier patterns take precedence
    seniority = case_when(
      str_detect(description, "nternship") ~ "Internship",
      str_detect(job_title, "intern|Intern") ~ "Internship",
      str_detect(description, "junior|Junior") ~ "Junior",
      str_detect(job_title, "junior|Junior") ~ "Junior",
      str_detect(description, "senior|Senior") ~ "Senior",
      str_detect(job_title, "senior|Senior") ~ "Senior",
      str_detect(description, "ntry level|ntry Level") ~ "Entry level",
      str_detect(job_title, "id-Senior|id-senior|id Senior|id senior") ~ "Mid-Senior level",
      str_detect(job_title, "irector") ~ "Director",
      str_detect(job_title, "xecutive") ~ "Executive",
      TRUE ~ NA_character_
    )
  ) %>%
  ## keep only the columns shared with (or derivable from) the LinkedIn data
  dplyr::select(
    job_title,
    employer,
    location,
    size,
    description,
    seniority,
    employment_type,
    industry,
    sector
  )
###################################################
## all job offers worldwide for global map:
## LinkedIn offers (sector taken from job_functions) stacked with the
## prepared GlassDoor offers
df_world <-
  df_li %>%
  dplyr::select(
    job_title,
    employer,
    location,
    size,
    description,
    seniority,
    employment_type,
    industry,
    "sector" = job_functions
  ) %>%
  full_join(df_gd_join) %>%
  mutate(
    ## collapse carriage returns, newlines and repeated whitespace so
    ## near-identical descriptions compare equal in distinct() below
    description = str_replace_all(description, "\\r", " "),
    description = str_replace_all(description, "\\n", " "),
    description = str_replace_all(description, "\\s+", " ")
  ) %>%
  ## drop offers listed on both platforms
  distinct(job_title, employer, location, description, .keep_all = TRUE)
###################################################
## only keep pages with language == EN
df_li_en <-
  df_li %>%
  ## filter based on the country subdomain of the offer's URL;
  ## commented-out subdomains are non-English-language LinkedIn sites
  filter(
    str_detect(url, "^https://www.") |
    str_detect(url, "^https://au.") |
    #str_detect(url, "^https://be.") |
    str_detect(url, "^https://ca.") |
    #str_detect(url, "^https://gh.") |
    #str_detect(url, "^https://gr.") |
    str_detect(url, "^https://ie.") |
    #str_detect(url, "^https://il.") |
    #str_detect(url, "^https://in.") |
    #str_detect(url, "^https://mg.") |
    #str_detect(url, "^https://ng.") |
    #str_detect(url, "^https://ph.") |
    #str_detect(url, "^https://sg.") |
    #str_detect(url, "^https://ua.") |
    #str_detect(url, "^https://vn.") |
    str_detect(url, "^https://za.")
  ) %>%
  mutate(
    ## extract the subdomain as a country marker (e.g. "www", "au", "za")
    country = str_sub(url, start = 1, end = 11),
    country = str_remove(country, "https://"),
    country = str_remove(country, "\\.")
  )
## join data: English LinkedIn offers + GlassDoor offers.
## The LinkedIn "job_functions" text is collapsed into the GlassDoor
## sector categories via keyword matching (first match wins).
df_en <-
  df_li_en %>%
  dplyr::select(
    job_title,
    employer,
    location,
    size,
    description,
    seniority,
    employment_type,
    industry,
    "sector" = job_functions
  ) %>%
  mutate(
    sector = case_when(
      str_detect(sector, "Business") ~ "Business Services",
      str_detect(sector, "Information Technology") ~ "Information Technology",
      str_detect(sector, "Consume") ~ "Consumer Services",
      str_detect(sector, "Health") ~ "Health Care",
      str_detect(sector, "Educat") ~ "Education",
      str_detect(sector, "Retail") ~ "Retail",
      str_detect(sector, "Insurance") ~ "Insurance",
      str_detect(sector, "Media") ~ "Media",
      str_detect(sector, "Manufact") ~ "Manufacturing",
      str_detect(sector, "Account|Legal") ~ "Accounting & Legal",
      str_detect(sector, "Travel|Tourism") ~ "Travel & Tourism",
      str_detect(sector, "Biotech|Pharma") ~ "Biotech & Pharmaceuticals",
      str_detect(sector, "Restaurant|Bar|Food") ~ "Restaurants, Bars & Food Services",
      str_detect(sector, "Transport|Logistic") ~ "Transportation & Logistics",
      str_detect(sector, "Construct|Repair|Maintenance") ~ "Construction, Repair & Maintenance",
      str_detect(sector, "Finance") ~ "Finance",
      str_detect(sector, "Oil|Gas|Energy|Utilit") ~ "Oil, Gas, Energy & Utilities",
      str_detect(sector, "ArtsEntertain|Recreat") ~ "Arts, Entertainment & Recreation",
      str_detect(sector, "Telecom") ~ "Telecommunications",
      str_detect(sector, "Gov") ~ "Government",
      TRUE ~ NA_character_
    )
  ) %>%
  full_join(df_gd_join) %>%
  mutate(
    ## normalize whitespace in descriptions (see df_world above)
    description = str_replace_all(description, "\\r", " "),
    description = str_replace_all(description, "\\n", " "),
    description = str_replace_all(description, "\\s+", " ")
  ) %>%
  ## dedupe without description — descriptions often differ slightly
  distinct(job_title, employer, location, .keep_all = TRUE)
#distinct(job_title, employer, location, description, .keep_all = TRUE) ## description often slightly different (diff: 111 offers)We use two data sets:
The LinkedIn data contain global job offers while the GlassDoor data only jobs from the US. The LinkedIn data including only job offers with the term SEO (or seo) contain 2,387 observations from English-speaking countries (USA, Canada, UK, Australia, Ireland, South Africa) and 552 from the USA and the UK (links starting with www.linkedin.com).
We merged both data sets and kept as many variables as possible, manually creating new variables for both datasets (GlassDoor: seniority and employment type; LinkedIn: sector) based on text matching of job titles and descriptions. We also removed as many duplicated entries as possible by matching job title, employer and job location. The final worldwide data set contains 3,127 observations.
Because the job offers are collected from all over the world, a lot of foreign terms are included. Thus, we merged the GlassDoor data also with the English subset of the LinkedIn data and kept again as many variables as possible by manually creating new variables for both data sets. The final “All English” data set contains 1,344 observations.
The GlassDoor data are cleaner with regard to job titles and descriptions than the LinkedIn data. Consequently, some plots using the GlassDoor data do a better job, so for now we provide both versions (the merged “All English” data set and the GlassDoor data set).
Also, the GlassDoor data contain information that is missing from the LinkedIn data, such as estimated salary range, rating, employer, industry, and size (no. of employees).
We analysed the data on job titles using text mining techniques. In a first step, we tokenize the job titles into single words and visualize their frequency. Stop words and words that appeared fewer than 25 times were removed to make the graph easier to grasp.
## Tokenize job titles into single words and plot term frequencies as a
## horizontal bar chart.
df_en %>%
  unnest_tokens(word, job_title, token = "words") %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  filter(
    n >= 25,
    ## drop the (trivially most frequent) query term itself
    !word %in% c("seo")
  ) %>%
  mutate(
    ## keep common abbreviations upper-case, title-case everything else
    word = if_else(
      word %in% c("sr", "ppc", "sem"),
      str_to_upper(word),
      str_to_title(word)
    )
  ) %>%
  ggplot(aes(fct_reorder(word, n), n)) +
  geom_col(fill = bl_col, width = .8) +
  geom_text(
    aes(label = n),
    family = "Overpass Mono",
    color = "white",
    fontface = "bold",
    size = 2.7,
    hjust = 1,
    nudge_y = -5
  ) +
  coord_flip() +
  scale_x_discrete(expand = c(.025, .025)) +
  scale_y_continuous(expand = c(.005, .005)) +
  labs(
    x = NULL,
    ## fixed axis-label typo ("frequence")
    y = "Term frequency in job titles",
    caption = 'Note: Only words with a frequency of 25 or more shown. The term "SEO" was removed.'
  ) +
  theme_flip

if (save == TRUE) {
  ggsave(here::here("plots", "1_1_jobs_word.pdf"), width = 12, height = 8, device = cairo_pdf)
}

## In a second step, we analysed sequences of words in the job title. The
## sorted bar plot shows the most popular consecutive sequences of words
## (10 or more occurrences), colored by category.
## Count word sequences (n-grams) in job titles, keep those occurring at
## least 10 times, and assign each sequence to a coarse job category used
## for coloring in the following plot.
df_job_ngrams_en <-
  df_en %>%
  unnest_tokens(word, job_title, token = "ngrams") %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  filter(
    n >= 10,
    !is.na(word),
    ## spelled-out version of "SEO" would dominate the chart
    word != "search engine optimization"
  ) %>%
  mutate(
    ## first matching rule wins
    group = case_when(
      str_detect(word, "analyst") ~ "Analyst",
      str_detect(word, "content") ~ "Content Writer",
      str_detect(word, "manager") ~ "Management",
      str_detect(word, "market") ~ "Marketing",
      str_detect(word, "special") ~ "Specialist",
      str_detect(word, "strategist") ~ "Strategy",
      str_detect(word, "executive|head") ~ "Executive",
      TRUE ~ "Other"
    ),
    word = str_to_title(word),
    ## restore upper-case abbreviations after title-casing
    word = str_replace(word, "Seo", "SEO"),
    word = str_replace(word, "Sem", "SEM"),
    word = str_replace(word, "Sr", "SR"),
    word = str_replace(word, "Ppc", "PPC"),
    word = factor(word)
  ) %>%
  ## "Other" last in the legend
  mutate(group = fct_relevel(group, "Other", after = Inf))
## Horizontal bar chart of word-sequence frequencies, filled by job category.
ggplot(
  df_job_ngrams_en,
  aes(fct_reorder(word, n), n)) +
  geom_col(
    aes(fill = group),
    width = .8
  ) +
  geom_text(
    aes(label = n),
    family = "Overpass Mono",
    color = "white",
    fontface = "bold",
    size = 3,
    hjust = 1,
    nudge_y = -.6
  ) +
  coord_flip() +
  scale_y_continuous(
    expand = c(.01, .01),
    limits = c(0, 33)
  ) +
  scale_fill_manual(
    name = "Job Category",
    values = c("#2d6db4", "#94346E", "#e49e2e", "#633b96", "#96633b", "#00b877", "#cc503e", "grey50")
  ) +
  guides(fill = guide_legend(ncol = 1)) +
  labs(
    x = NULL,
    ## fixed axis-label typo ("frequence")
    y = "Frequency of word sequence in job titles",
    caption = 'Note: Only sequences with a frequency of 10 or more shown. The sequence "Search Engine Optimization" was removed.'
  ) +
  theme_flip +
  theme(
    ## legend placed inside the panel area
    legend.position = c(.76, .45),
    legend.key.size = unit(3, "pt"),
    legend.key.width = unit(30, "pt")
  )

if (save == TRUE) {
  ggsave(here::here("plots", "1_1_jobs_cat.pdf"), width = 12, height = 7.5, device = cairo_pdf)
}

## We manually classified words into technical and non-technical positions,
## removing all words that are not specific to either category. The modified
## stacked bar plot shows the number of words found per job category and,
## additionally as another stacked bar next to it, the most common words per
## category (with labels for words that occurred at least 10 times). The
## height of the stacks indicates the number as well; the width is arbitrary.
## technical: analyst, engineer, developer, technician, optimization,
## non-technical: manager, director, writer, consultant, coordinator, editor, marketing, sales, social media
## Word counts per type ("technical"/"non-technical") plus stack midpoints
## (pos), per-word alpha, and a per-type label anchor (pos_cont) for the
## stacked bar figure below.
df_stack <-
  df_en %>%
  unnest_tokens(word, job_title, token = "words") %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(
    type = case_when(
      str_detect(word, "analy|develop|technic|data") ~ "technical",
      str_detect(word, "manage|direct|writ|consult|coordinat|edito|market|sale|social|strateg|supervis") ~ "non-technical",
      TRUE ~ "unknown"
    )
  ) %>%
  filter(type != "unknown") %>%
  group_by(type) %>%
  mutate(sum = sum(n)) %>%
  ungroup() %>%
  arrange(-sum, n) %>%
  mutate(
    ## midpoint of each word's segment within the running stack
    pos = cumsum(n),
    pos = if_else(!is.na(lag(pos)), pos - ((pos - lag(pos)) / 2), pos / 2)
  ) %>%
  mutate(type = fct_reorder(factor(type), sum)) %>%
  ## NOTE(review): "country" looks like a copy-paste name — it orders words,
  ## not countries; kept unchanged in case downstream code references it
  mutate(country = fct_reorder2(factor(word), as.numeric(type), n, .desc = FALSE)) %>%
  group_by(type) %>%
  arrange(n) %>%
  ## result intentionally stays grouped by type for the summaries below
  mutate(
    alpha = n / max(n),
    pos_cont = min(pos) + (max(pos) - min(pos)) / 2
  )
## Modified stacked bar chart: one stack per type with per-word segments,
## a darker inner bar, big type labels/totals, and word + count labels to
## the right (geom_rect masks the left half of the wide outer bar).
df_stack %>%
  ggplot(aes(1, n)) +
  geom_col(
    aes(
      fill = type,
      fill = after_scale(colorspace::darken(fill, .05)),
      color = type,
      color = after_scale(colorspace::darken(color, .15)),
      alpha = alpha
    ),
    size = .1
  ) +
  geom_col(
    aes(
      fill = type,
      color = type,
      fill = after_scale(colorspace::darken(fill, .3)),
      color = after_scale(colorspace::darken(color, .3))
    ),
    width = .4,
    size = .1
  ) +
  ## white mask hiding the left part of the outer bar
  geom_rect(
    xmin = -Inf,
    xmax = .8,
    ymin = -Inf,
    ymax = Inf,
    fill = "white"
  ) +
  ## type label above the per-type total
  geom_text(
    data = df_stack %>% group_by(type) %>% summarize(pos_cont = unique(pos_cont)),
    aes(
      x = 1,
      y = pos_cont + 35,
      label = glue::glue("{type}\ntitle terms")
    ),
    family = "Montserrat",
    fontface = "bold",
    color = "white",
    size = 8,
    lineheight = .9,
    hjust = .5,
    vjust = 0
  ) +
  ## per-type total (formatted with thousands separator)
  geom_text(
    data = df_stack %>%
      group_by(type) %>%
      summarize(
        pos_cont = unique(pos_cont),
        sum = format(unique(sum), big.mark = ",")
      ),
    aes(
      x = 1,
      y = pos_cont,
      label = sum
    ),
    family = "Overpass",
    color = "white",
    size = 10,
    lineheight = .9,
    hjust = .5,
    vjust = .5
  ) +
  ## word labels, sized by frequency
  geom_text(
    data = df_stack %>% filter(n >= 10),
    aes(
      x = 1.47,
      y = pos,
      label = word,
      color = type,
      color = after_scale(colorspace::darken(color, .15)),
      size = n
    ),
    family = "Montserrat",
    fontface = "bold",
    hjust = 0
  ) +
  ## count labels inside the inner bar
  geom_text(
    data = df_stack %>% filter(n >= 10),
    aes(
      x = 1.33,
      y = pos,
      label = n,
      size = n / 2
    ),
    family = "Overpass",
    color = "white",
    hjust = .5
  ) +
  scale_x_continuous(limits = c(0.5, 2.15)) +
  ## guide = FALSE instead of the discouraged single-letter F
  scale_color_manual(
    values = c(bl_col, "#8800d1"),
    guide = FALSE
  ) +
  scale_fill_manual(
    values = c(bl_col, "#8800d1"),
    guide = FALSE
  ) +
  scale_alpha(
    range = c(.3, 1),
    guide = FALSE
  ) +
  scale_size(
    range = c(1.5, 15),
    guide = FALSE
  ) +
  theme_void()

## world map
## world basemap in an equal-area cylindrical projection (meters)
sf_world <-
  st_as_sf(rworldmap::getMap(resolution = "high")) %>%
  st_transform(crs = "+proj=cea +lon_0=0 +lat_ts=30 +x_0=0 +y_0=0 +datum=WGS84 +ellps=WGS84 +units=m +no_defs") %>%
  dplyr::select(ISO_A3, continent)
## US states (conterminous US only, EPSG:2163 US National Atlas equal area)
sf_states <-
  usa_sf() %>%
  filter(!name %in% c("Alaska", "Hawaii")) %>%
  st_transform(2163)
## US counties (same filter and projection as the states layer)
sf_counties <-
  counties_sf() %>%
  filter(!state %in% c("Alaska", "Hawaii")) %>%
  st_transform(2163)
## add geocodes of cities worldwide
## cache file for worldwide geocodes
path <- here::here("proc_data", "geocodes_world.Rds")
## offers per location string, worldwide
df_loc_world <-
  df_world %>%
  group_by(location) %>%
  count(sort = T) %>%
  ungroup()
## geocoding disabled: the service returns only addresses from North America
# if(file.exists(path)){
#   ## load geocodes or...
#   df_geo_world <- readRDS(path)
# }else{
#   ## grab and save geocodes
#   df_geo_world <-
#     gio_batch_geocode(df_loc_world$location) %>%
#     unnest(response_results, .preserve = response_warnings) %>%
#     dplyr::select(query, formatted_address, location.lat, location.lng) %>%
#     group_by(query) %>%
#     slice(1)
#
#   saveRDS(df_geo_world, path)
# }
## alternative geocoder — billing needed!
#df_geo <- mutate_geocode(df_loc_world, location)
# sf_map_world <-
#   df_loc_world
#   left_join(df_geo_world, by = c("location" = "query")) %>%
#   filter(!is.na(location.lng)) %>%
#   mutate(location.lng = location.lng * 10^5, location.lat = location.lat * 10^5) %>%
#   st_as_sf(coords = c("location.lng", "location.lat"),
#            crs = st_crs(world)) %>%
#   st_transform(st_crs(world)) ## locations English-speaking countries
## offers per location string, English-speaking subset
df_loc_en <-
  df_en %>%
  group_by(location) %>%
  count(sort = T) %>%
  ungroup()
## add geocodes of cities in English-speaking countries
## (cached on disk to avoid repeated API calls)
path <- here::here("proc_data", "geocodes_en.Rds")
if(file.exists(path)){
  ## load cached geocodes or...
  df_geo_en <- readRDS(path)
}else{
  ## ...grab and save geocodes (one best-match row per query)
  df_geo_en <-
    gio_batch_geocode(df_loc_en$location) %>%
    unnest(response_results, .preserve = response_warnings) %>%
    dplyr::select(query, formatted_address, location.lat, location.lng) %>%
    group_by(query) %>%
    slice(1)
  saveRDS(df_geo_en, path)
}
## map data North America: one point per location with the offer count n;
## non-US countries are dropped by name matching on the location string
sf_map_us <-
  df_loc_en %>%
  left_join(df_geo_en, by = c("location" = "query")) %>%
  filter(
    !is.na(location.lng),
    !str_detect(location, "United Kingdom"),
    !str_detect(location, "South Africa"),
    !str_detect(location, "Australia"),
    !str_detect(location, "Ireland"),
    !str_detect(location, "Canada")
  ) %>%
  ## WGS84 lon/lat -> projection of the states layer, cropped to its extent
  st_as_sf(coords = c("location.lng", "location.lat"),
           crs = 4326) %>%
  st_transform(st_crs(sf_states)) %>%
  st_crop(st_bbox(sf_states))
## long version with 1 row per offer to count spatially (st_intersects
## against state/county polygons later)
sf_map_us_long <-
  df_loc_en %>%
  group_by(location) %>%
  ## expand each location to n rows; seq_len(n) is the safe equivalent of
  ## the original seq(1:n)
  expand(n = seq_len(n)) %>%
  left_join(df_geo_en, by = c("location" = "query")) %>%
  filter(
    !is.na(location.lng),
    !str_detect(location, "United Kingdom"),
    !str_detect(location, "South Africa"),
    !str_detect(location, "Australia"),
    !str_detect(location, "Ireland"),
    !str_detect(location, "Canada")
  ) %>%
  st_as_sf(coords = c("location.lng", "location.lat"),
           crs = 4326) %>%
  st_transform(st_crs(sf_states)) %>%
  st_crop(st_bbox(sf_states))

# sf_map_world %>%
# arrange(-n) %>%
# ggplot() +
# geom_sf(data = world,
# #color = colorspace::darken(bl_col, .05),
# color = "grey60",
# alpha = .3,
# lwd = 5) +
# geom_sf(data = world,
# color = "white",
# fill = "#cedbd7",
# lwd = .5) +
# geom_sf(aes(size = n),
# color = "white",
# show.legend = "point") +
# geom_sf(aes(size = n),
# shape = 21,
# color = colorspace::darken(bl_col, .2),
# fill = NA,
# stroke = .4,
# show.legend = "point") +
# geom_sf(aes(size = n, color = n),
# color = bl_col,
# alpha = .1,
# show.legend = "point") +
# scale_size(range = c(3, 30),
# breaks = c(1, 10, 25, 50, 100),
# name = "Number of job offers") +
# guides(size = guide_legend(title.position = "top",
# title.hjust = .5,
# nrow = 1,
# label.position = "bottom",
# override.aes = list(shape = 21, color = bl_col, fill = colorspace::lighten(bl_col, .9), stroke = 1))) +
# theme_map +
# theme(legend.position = c(.2, .1),
# legend.title = element_text(margin = margin(0, 0, -5, 0)),
# legend.text = element_text(margin = margin(-10, 0, 0, 0)))
#
# if(save == T){
# ggsave(here::here("plots", "2_1_map_world_cities.pdf"),
# width = 15, height = 9.7, device = cairo_pdf)
# }

## Dark-background bubble map: job offers per US city on a world basemap.
sf_map_us %>%
  ## draw big bubbles first so small ones sit on top
  arrange(-n) %>%
  ggplot() +
  # geom_sf(
  #   data = world,
  #   #color = colorspace::darken(bl_col, .05),
  #   color = "grey60",
  #   alpha = .2,
  #   lwd = 3
  # ) +
  geom_sf(
    data = sf_world,
    color = "grey60",
    fill = colorspace::lighten("#374b45", .15),
    lwd = .4
  ) +
  ## darker fill highlights the USA
  geom_sf(
    data = sf_world %>% filter(ISO_A3 == "USA"),
    color = "grey60",
    fill = "#374b45",
    lwd = .4
  ) +
  ## three stacked point layers: white base, thin outline, translucent fill
  geom_sf(
    aes(size = n),
    color = "white",
    show.legend = "point"
  ) +
  geom_sf(
    aes(size = n),
    shape = 21,
    color = colorspace::darken(bl_col, .2),
    fill = NA,
    stroke = .1,
    show.legend = "point"
  ) +
  geom_sf(
    aes(
      size = n,
      color = n
    ),
    color = bl_col,
    alpha = .1,
    show.legend = "point"
  ) +
  ## zoom to the conterminous US (projected coordinates)
  coord_sf(
    xlim = c(-2300000, 2900000),
    ylim = c(-2200000, 1000000)
  ) +
  scale_size(
    range = c(1, 25),
    breaks = c(1, 10, 50, 100),
    name = "Number of\njob offers"
  ) +
  guides(
    size = guide_legend(
      title.position = "top",
      title.hjust = .5,
      nrow = 1,
      label.position = "bottom",
      override.aes = list(
        shape = 21,
        color = bl_col,
        fill = colorspace::lighten(bl_col, .9),
        stroke = 1
      )
    )
  ) +
  theme_map +
  theme(
    legend.position = c(.1, .12),
    legend.title = element_text(color = "grey90", margin = margin(0, 0, -5, 0)),
    legend.text = element_text(color = "grey90", margin = margin(-5, 0, 0, 0)),
    panel.background = element_rect(fill = "grey30"),
    panel.border = element_rect(color = "grey60", fill = "transparent", size = 4.5),
    panel.grid.major = element_line(color = "grey45", size = .4, linetype = "dashed"),
    plot.margin = margin(0, 0, 0, 0)
  )

sf_map_us %>%
  ## light variant of the bubble map: states-only basemap with a soft halo
  ## (wide transparent stroke under a thin one); big bubbles drawn first
  arrange(-n) %>%
  ggplot() +
  geom_sf(
    data = sf_states,
    #color = colorspace::darken(bl_col, .05),
    color = "grey60",
    alpha = .3,
    lwd = 5
  ) +
  geom_sf(
    data = sf_states,
    color = "#cedbd7",#"white",
    fill = colorspace::darken("#cedbd7", .15, space = "HLS"),
    lwd = .5
  ) +
  ## three stacked point layers: white base, outline, translucent fill
  geom_sf(
    aes(size = n),
    color = "white",
    show.legend = "point"
  ) +
  geom_sf(
    aes(size = n),
    shape = 21,
    color = colorspace::darken(bl_col, .2),
    fill = NA,
    stroke = .4,
    show.legend = "point"
  ) +
  geom_sf(
    aes(size = n, color = n),
    color = bl_col,
    alpha = .1,
    show.legend = "point"
  ) +
  scale_size(
    range = c(3, 30),
    breaks = c(1, 10, 25, 50, 100),
    name = "Number of job offers"
  ) +
  guides(
    size = guide_legend(
      title.position = "top",
      title.hjust = .5,
      nrow = 1,
      label.position = "bottom",
      override.aes = list(
        shape = 21,
        color = bl_col,
        fill = colorspace::lighten(bl_col, .9),
        stroke = 1
      )
    )
  ) +
  theme_map +
  theme(
    legend.position = c(.2, .1),
    legend.title = element_text(margin = margin(0, 0, -5, 0)),
    legend.text = element_text(margin = margin(-10, 0, 0, 0))
  )
## centroids for labels
## offers per state, counted spatially by intersecting the per-offer points
## with state polygons; zero counts become NA so they render grey
sf_states_count <-
  sf_states %>%
  mutate(
    pt_count = lengths(st_intersects(sf_states, sf_map_us_long)),
    pt_count = if_else(pt_count == 0, NA_integer_, pt_count)
  )
## state centroids (computed for labeling; the plot below uses
## geom_sf_text instead, so this object may be unused here)
centroids <-
  sf_states_count %>%
  st_centroid()
## Choropleth of spatially-counted offers per state; labels in white on dark
## fills (>= 100), black otherwise; Maryland nudged to avoid overlap.
sf_states_count %>%
  ggplot() +
  geom_sf(
    aes(fill = pt_count),
    color = "grey80",
    size = .6
  ) +
  #geom_sf(data = sf_map_en_long) +
  rcartocolor::scale_fill_carto_c(
    palette = "Emrld",
    na.value = "grey96",
    breaks = c(1, seq(25, 200, by = 25)),
    name = "Number of job offers"
  ) +
  geom_sf_text(
    data = sf_states_count %>% filter(name != "Maryland", pt_count < 100),
    aes(label = pt_count),
    family = "Overpass",
    fontface = "bold"
  ) +
  geom_sf_text(
    data = sf_states_count %>% filter(name != "Maryland", pt_count >= 100),
    aes(label = pt_count),
    family = "Overpass",
    fontface = "bold",
    color = "white"
  ) +
  geom_sf_text(
    data = sf_states_count %>% filter(name == "Maryland"),
    aes(label = pt_count),
    family = "Overpass",
    fontface = "bold",
    nudge_y = 35000
  ) +
  guides(fill = guide_colorbar(
    title.position = "top",
    title.hjust = .5,
    label.position = "bottom",
    barwidth = unit(30, "lines"),
    barheight = unit(.6, "lines"))
  ) +
  theme_map +
  theme(legend.position = c(.5, .95))

if (save == TRUE) {
  ggsave(here::here("plots", "2_2_map_states_chloro.pdf"),
         width = 15, height = 9.7, device = cairo_pdf)
}

## (This map is derived from spatial locations by intersecting cities with
## state polygons - thus slightly different numbers compared to the hexagonal
## grid map which uses states as stated by the source.)
## alternative state counts: parse the state token out of the location
## string and join it onto the state polygons by ISO code
df_states_join <-
  df_loc_en %>%
  mutate(location = str_replace(location, ", United States|, US", "")) %>%
  ## last comma-free token of the location, e.g. "NY"
  mutate(state_mixed = str_extract(location, "[^, ]*$"))
sf_states_count <-
  sf_states %>%
  left_join(df_states_join, by = c("iso_3166_2" = "state_mixed")) %>%
  filter(!is.na(geo_id)) %>%
  group_by(name) %>%
  summarize(n = sum(n, na.rm = T)) %>%
  ## zero counts become NA so they render grey in the fill scale
  mutate(n = if_else(n == 0, NA_integer_, n)) %>%
  ungroup()
## centroids for labels (plot below uses geom_sf_text; may be unused)
centroids <-
  sf_states_count %>%
  st_centroid()
## same choropleth as above but based on the string-parsed state counts (n)
sf_states_count %>%
  ggplot() +
  geom_sf(
    aes(fill = n),
    color = "grey80",
    size = .6
  ) +
  #geom_sf(data = sf_map_en_long) +
  rcartocolor::scale_fill_carto_c(
    palette = "Emrld",
    na.value = "grey96",
    breaks = c(1, seq(25, 200, by = 25)),
    name = "Number of job offers"
  ) +
  ## dark labels on light fills (< 100)...
  geom_sf_text(
    data = sf_states_count %>% filter(name != "Maryland", n < 100),
    aes(label = n),
    family = "Overpass",
    fontface = "bold"
  ) +
  ## ...white labels on dark fills (>= 100)
  geom_sf_text(
    data = sf_states_count %>% filter(name != "Maryland", n >= 100),
    aes(label = n),
    family = "Overpass",
    fontface = "bold",
    color = "white"
  ) +
  ## Maryland nudged north to avoid label overlap
  geom_sf_text(
    data = sf_states_count %>% filter(name == "Maryland"),
    aes(label = n),
    family = "Overpass",
    fontface = "bold",
    nudge_y = 35000
  ) +
  guides(fill = guide_colorbar(
    title.position = "top",
    title.hjust = .5,
    label.position = "bottom",
    barwidth = unit(30, "lines"),
    barheight = unit(.6, "lines"))
  ) +
  theme_map +
  theme(legend.position = c(.5, .95))
## data by states
## lookup table ISO2 code -> full state name, used to normalize the mixed
## state spellings extracted from the location strings
df_states <-
  readr::read_csv(here::here("raw_data", "50_us_states_world_data.csv"),
                  col_names = FALSE) %>%
  dplyr::select(state = "X2", ISO2 = "X3") %>%
  ## DC is missing from the source file; the name must match the hex grid's
  ## google_name exactly ("District of Columbia" — the original spelling
  ## "Colombia" silently dropped DC in the hex-map join below)
  add_row(state = "District of Columbia", ISO2 = "DC")
df_states_join <-
  df_loc_en %>%
  mutate(location = str_replace(location, ", United States|, US", "")) %>%
  mutate(state_mixed = str_extract(location, "[^, ]*$")) %>%
  left_join(df_states, by = c("state_mixed" = "ISO2")) %>%
  ## locations already spelled out keep their extracted token
  mutate(state = if_else(is.na(state), state_mixed, state)) %>%
  group_by(state) %>%
  summarize(n = sum(n, na.rm = TRUE)) %>%
  ungroup()
## hex map: read the hexagonal state grid and strip the "(United States)"
## suffix so google_name matches the state names in df_states_join
map_hex <- geojson_read(here::here("raw_data", "us_states_hexgrid.geojson.json"), what = "sp")
map_hex@data <-
  map_hex@data %>%
  mutate(google_name = gsub(" \\(United States\\)", "", google_name))
## NOTE(review): tidy() on an sp object relies on broom's sp tidiers, which
## are deprecated in recent broom versions — confirm the installed version
map_hex_fortified <- tidy(map_hex, region = "google_name")
## combine hex polygons with per-state counts
df_hex <-
  map_hex_fortified %>%
  left_join(df_states_join, by = c("id" = "state"))
## centroids for labels (one row per hex tile, missing counts set to 0)
centroids <- cbind.data.frame(data.frame(gCentroid(map_hex, byid = TRUE),
                                         id = map_hex@data$iso3166_2,
                                         id_long = map_hex@data$google_name,
                                         id_wrap = str_wrap(map_hex@data$google_name, 12))) %>%
  left_join(df_hex, by = c("id_long" = "id")) %>%
  group_by(id) %>%
  slice(1) %>%
  replace_na(list(n = 0))

df_hex %>%
replace_na(list(n = 0)) %>%
ggplot() +
geom_polygon(
aes(
long, lat,
group = group,
fill = n
),
color = colorspace::darken(bl_col, .1),
#color = "grey60",
lwd = .8
) +
geom_text(
data = centroids,
aes(
x = x,
y = y + 0.35,
label = n,
alpha = n
),
family = "Montserrat",
fontface = "bold",
size = 7.5,
color = colorspace::darken(bl_col, .6),
vjust = .2
) +
geom_text(
data = centroids,
aes(
x = x,
y = y - 0.3,
label = id_wrap
),
family = "Montserrat",
fontface = "bold",
size = 3.2,
lineheight = 0.85,
vjust = 1
) +
coord_map() +
scale_fill_gradient(
low = "grey95",
high = colorspace::darken(bl_col, .1),
name = "Number of job offers",
limits = c(0, 200),
breaks = seq(0, 200, by = 25)
) +
scale_alpha(
range = c(.25, 1),
guide = F
) +
guides(fill = guide_colorbar(
barheight = unit(5, units = "mm"),
barwidth = unit(150, units = "mm"),
direction = "horizontal",
ticks.colour = "#e8d8c3",
title.position = "top",
title.hjust = 0.5)
) +
theme_map +
theme(
legend.position = c(.5, .9),
legend.text = element_text(size = 14)
)sf_counties %>%
mutate(
  ## count job-offer locations that fall inside each county polygon
  pt_count = lengths(st_intersects(sf_counties, sf_map_us_long)),
  ## New York (155 offers, see annotation below) is capped at the scale
  ## maximum so it does not wash out all other counties
  pt_count = if_else(name == "New York", 60L, pt_count),
  ## counties without offers become NA so they take the na.value fill
  pt_count = if_else(pt_count == 0, NA_integer_, pt_count)
) %>%
  ggplot() +
  geom_sf(
    aes(fill = pt_count),
    color = "grey80",
    size = .2
  ) +
  ## state outlines drawn on top of the county choropleth
  geom_sf(
    data = sf_states,
    fill = NA,
    color = "grey50",
    size = .3
  ) +
  ## arrow from the annotation text to New York
  geom_segment(
    x = 1600000,
    xend = 2142000,
    y = 250000,
    yend = -125000,
    arrow = arrow(length = unit(0.015, "npc")),
    color = "grey40",
    size = .5
  ) +
  geom_text(
    x = 1600000,
    y = 276000,
    label = "New York:\n155 job offers",
    family = "Montserrat",
    color = "grey40",
    size = 3.5,
    lineheight = .9,
    vjust = 0
  ) +
  rcartocolor::scale_fill_carto_c(
    palette = "Emrld",
    na.value = "grey96",
    name = "Number of job offers",
    breaks = c(1, seq(10, 60, by = 10)),
    limits = c(NA, 60)
  ) +
  guides(
    fill = guide_colorbar(
      title.position = "top",
      title.hjust = .5,
      label.position = "bottom",
      barwidth = unit(30, "lines"),
      barheight = unit(.6, "lines")
    )
  ) +
  theme_map +
  theme(legend.position = c(.5, .95))

if (isTRUE(save)) {
  ggsave(here::here("plots", "2_3_map_counties_chloro.pdf"),
         width = 15, height = 9.7, device = cairo_pdf)
}

## Note: this map is derived from spatial locations by intersecting cities
## with county polygons - thus there might be small differences compared to
## the state-level maps.
## recode the company-size field into an ordered factor with pretty labels
df_size <-
  df_gd %>%
  dplyr::select(employer, size, description) %>%
  mutate(
    size = if_else(is.na(size), "Unknown", size),
    size = str_replace(size, " employees", ""),
    size = str_replace(size, " to ", "–"),
    size = factor(size,
                  levels = c("1–50", "51–200", "201–500", "501–1000",
                             "1001–5000", "5001–10000", "10000+", "Unknown"),
                  labels = c("1–50", "51–200", "201–500", "501–1,000",
                             "1,001–5,000", "5,001–10,000", "10,000+", "Unknown"))
  )

## -> Counts of unique companies per size class
df_size %>%
  group_by(size) %>%
  summarize(n = n_distinct(employer)) %>%
  ## grey out the "Unknown" class, brand color for the rest
  mutate(col = if_else(size == "Unknown", "A", "B")) %>%
  ggplot(aes(size, n)) +
  geom_col(
    aes(
      fill = col,
      ## border color derived from the fill at draw time
      color = after_scale(colorspace::darken(fill, .4))
    ),
    width = .85,
    size = 1
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, 195)
  ) +
  scale_fill_manual(
    values = c("grey60", bl_col),
    guide = "none"
  ) +
  labs(
    x = "Number of employees",
    y = "Number of companies"
  ) +
  theme(panel.grid.major.x = element_blank())

if (isTRUE(save)) {
  ggsave(here::here("plots", "3_1_size_histo.pdf"),
         width = 12, height = 8, device = cairo_pdf)
}

## CED: What do we define as “more specialized tasks”?
## recode the estimated-revenue field into an ordered factor
df_revenue <-
  df_gd %>%
  dplyr::select(revenue, employer, description) %>%
  mutate(
    revenue = if_else(revenue == "Unknown / Non-Applicable", "Unknown", revenue),
    revenue = if_else(is.na(revenue), "Unknown", revenue),
    revenue = str_replace(revenue, " \\(USD\\)", ""),
    revenue = str_replace(revenue, " million", "M"),
    revenue = str_replace(revenue, " billion", "B"),
    revenue = str_replace(revenue, " to \\$", "–"),
    revenue = str_replace(revenue, "Less than", "<"),
    revenue = if_else(revenue == "$10+B", "> $10B", revenue),
    revenue = factor(revenue,
                     levels = c("< $1M", "$1–5M", "$5–10M", "$10–25M",
                                "$25–50M", "$50–100M", "$100–500M", "$500M–1B",
                                "$1–2B", "$2–5B", "$5–10B", "> $10B",
                                "Unknown"))
  )

## -> Counts of unique companies per revenue class
df_revenue %>%
  group_by(revenue) %>%
  summarize(n = n_distinct(employer)) %>%
  filter(revenue != "Unknown") %>%
  ggplot(aes(revenue, n)) +
  geom_col(
    fill = bl_col,
    ## after_scale() only works inside aes(); as a static parameter it
    ## errors, so derive the border color from bl_col directly
    color = colorspace::darken(bl_col, .4, space = "HLS"),
    width = .8,
    size = 1
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, NA)
  ) +
  ## original labels were swapped/copied from another plot: x shows the
  ## revenue class, y the number of distinct companies
  labs(
    x = "Estimated revenue",
    y = "Number of companies"
  ) +
  theme(
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(size = 10)
  )

if (isTRUE(save)) {
  ggsave(here::here("plots", "3_2_revenue_histo.pdf"),
         width = 12, height = 8, device = cairo_pdf)
}

## I tokenized the description and removed stop words and numbers as well as
## manually non-sense/non-skill-related words. There might be more but if we
## keep it we can have a closer look I would say.
## Tokenize job descriptions into single words for the revenue word clouds:
## drop stop words, a hand-curated list of non-skill boilerplate words, and
## tokens containing digits or "+".
df_rev_skills <-
df_gd %>%
unnest_tokens(word, description, token = "words") %>%
anti_join(stop_words) %>%
filter(
## manually curated non-skill/boilerplate words — extend as needed
!word %in% c("seo","experience", "skills", "skill", "ability", "requirement", "requirements", "link", "required", "provide", "day", "email", "based", "growth", "recommendations", "employment", "paid", "payment", "key", "gender", "equal", "applicants", "application", "disability", "disabilities", "world", "qualifications", "multiple", "page", "pages", "related", "site", "candidate", "insurance", "company", "agency", "office", "position"),
!str_detect(word, "[0-9+]")
) %>%
## unify singular/plural spelling of "team"
mutate(word = if_else(word == "teams", "team", word))
## fixed seed so the word-cloud layouts below are reproducible
set.seed(2020)
## word cloud of the 75 most frequent words for companies below $100M revenue
cloud_low <-
  df_rev_skills %>%
  filter(revenue %in% c("< $1M", "$1–5M", "$5–10M", "$10–25M",
                        "$25–50M", "$50–100M")) %>%
  count(word, sort = TRUE) %>%
  top_n(75, n) %>%
  ggplot(aes(label = word, size = n, color = n)) +
  geom_text_wordcloud(
    family = "Montserrat",
    fontface = "bold",
    shape = "square",
    grid_margin = 2.5
  ) +
  rcartocolor::scale_color_carto_c(palette = "Emrld") +
  scale_size_area(max_size = 7) +
  labs(title = "\nCompanies with revenues lower than $100M") +
  theme_minimal()

## same cloud for companies at or above $100M revenue
cloud_high <-
  df_rev_skills %>%
  filter(revenue %in% c("$100–500M", "$500M–1B",
                        "$1–2B", "$2–5B", "$5–10B", "> $10B")) %>%
  count(word, sort = TRUE) %>%
  top_n(75, n) %>%
  ggplot(aes(label = word, size = n, color = n)) +
  geom_text_wordcloud(
    family = "Montserrat",
    fontface = "bold",
    shape = "square",
    grid_margin = 2.5
  ) +
  rcartocolor::scale_color_carto_c(palette = "Emrld") +
  scale_size_area(max_size = 7) +
  labs(title = "\nCompanies with revenues higher than $100M") +
  theme_minimal()

## vertical alignment; patchwork's `*` applies the theme to all panels
(cloud_low / cloud_high) *
  theme(plot.title = element_text(family = "Montserrat", size = 14, face = "bold",
                                  hjust = .5, margin = margin(b = -30)))

if (isTRUE(save)) {
  ggsave(here::here("plots", "3_2_revenue_words_vertical.pdf"),
         width = 11, height = 8, device = cairo_pdf)
}

## horizontal alignment
(cloud_low + cloud_high) *
  theme(plot.title = element_text(family = "Montserrat", size = 14, face = "bold",
                                  hjust = .5, margin = margin(b = -15)))

df_gd %>%
filter(!is.na(sector)) %>%
  group_by(sector) %>%
  count(sort = TRUE) %>%
  ungroup() %>%
  ## order bars by count for the flipped chart
  mutate(sector = fct_reorder(factor(sector), n)) %>%
  ggplot(aes(sector, n)) +
  geom_col(
    width = .8,
    fill = bl_col
  ) +
  ## count label just right of each bar end
  geom_text(
    aes(label = n),
    family = "Overpass Mono",
    color = "grey40",
    fontface = "bold",
    hjust = 0,
    nudge_y = 3,
    size = 2.7
  ) +
  coord_flip() +
  scale_y_continuous(
    expand = c(.01, .01),
    limits = c(0, 345),
    breaks = seq(0, 300, by = 50)
  ) +
  labs(
    x = NULL,
    y = "Number of job offers by sector"
  ) +
  theme_flip

if (isTRUE(save)) {
  ggsave(here::here("plots", "3_3_sector_counts.pdf"),
         width = 12, height = 6.5, device = cairo_pdf)
}

df_gd %>%
filter(!is.na(industry)) %>%
  group_by(industry) %>%
  count(sort = TRUE) %>%
  filter(n > 4) %>%  # only industries with 5+ offers (see caption)
  ungroup() %>%
  mutate(industry = fct_reorder(factor(industry), n)) %>%
  ggplot(aes(industry, n)) +
  geom_col(
    width = .8,
    fill = bl_col
  ) +
  geom_text(
    aes(label = n),
    family = "Overpass Mono",
    color = "grey40",
    fontface = "bold",
    hjust = 0,
    nudge_y = 3,
    size = 2.7
  ) +
  coord_flip() +
  scale_y_continuous(
    expand = c(.01, .01),
    limits = c(0, 265),
    breaks = seq(0, 250, by = 50)
  ) +
  labs(
    x = NULL,
    y = "Number of job offers by industry",
    caption = "Note: Only industries with 5 or more job offers shown."
  ) +
  theme_flip

## -> Counts of unique companies per rating
df_gd %>%
  group_by(ratings) %>%
  summarize(n = n_distinct(employer)) %>%
  ggplot(aes(ratings, n)) +
  ## lollipop stems down to zero
  geom_segment(
    aes(
      xend = ratings,
      yend = 0
    ),
    color = colorspace::darken(bl_col, .2),
    size = 1.5
  ) +
  geom_point(
    fill = bl_col,
    color = colorspace::darken(bl_col, .2),
    shape = 21,
    size = 5,
    stroke = .5
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, 40)
  ) +
  ## original x label ("Number of employees") was copied from another plot;
  ## the x axis shows the company rating
  labs(
    x = "Company rating",
    y = "Number of companies"
  ) +
  theme(panel.grid.major.x = element_blank())

if (isTRUE(save)) {
  ggsave(here::here("plots", "3_4_rating_lolli.pdf"),
         width = 12, height = 8, device = cairo_pdf)
}

df_gd %>%
ungroup() %>%
  ## floor(ratings - .01) buckets ratings into (0,1], (1,2], ... so an exact
  ## integer rating falls into the lower bucket
  mutate(rating_floor = floor(ratings - .01)) %>%
  group_by(rating_floor) %>%
  summarize(n = n_distinct(employer)) %>%
  ggplot(aes(rating_floor, n)) +
  geom_col(
    fill = bl_col,
    color = colorspace::darken(bl_col, .2)
  ) +
  scale_x_continuous(
    breaks = 0:4,
    labels = c("< 1", "1–2", "2–3", "3–4", "> 4")
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, NA)
  ) +
  labs(
    x = "Rating range",
    y = "Number of companies"
  ) +
  theme(panel.grid.major.x = element_blank())

df_tasks_words <-
df_gd %>%
  unnest_tokens(word, description, token = "words") %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  filter(
    nchar(word) > 2,             # drop very short tokens
    !str_detect(word, "[0-9]+")  # drop tokens containing digits
  )

df_tasks_words %>%
  filter(
    word != "seo",  # the search term itself would dominate the cloud
    n >= 10
  ) %>%
  ggplot(
    aes(
      label = word,
      size = n,
      color = n
    )
  ) +
  geom_text_wordcloud(
    family = "Montserrat",
    fontface = "bold",
    shape = "square",
    grid_margin = 3
  ) +
  rcartocolor::scale_color_carto_c(palette = "ag_Sunset") +
  scale_size_area(max_size = 20) +
  theme_minimal()

if (isTRUE(save)) {
  ggsave(here::here("plots", "4_cloud_word.pdf"),
         width = 25, height = 15, device = cairo_pdf)
}
## word sequences (n-grams) from the descriptions.
## NOTE(review): token = "ngrams" requires an explicit n in tidytext;
## without it unnest_tokens() errors. n = 2 (bigrams) assumed — confirm.
## Also: anti_join on the full ngram only removes ngrams that are exactly
## a single stop word; it does not filter ngrams containing stop words.
df_tasks_sequ <-
  df_gd %>%
  unnest_tokens(word, description, token = "ngrams", n = 2) %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE)

df_tasks_sequ %>%
  filter(
    word != "seo",
    n >= 10
  ) %>%
  ggplot(aes(label = word, size = n, color = n)) +
  geom_text_wordcloud(
    family = "Montserrat",
    fontface = "bold",
    shape = "square"
  ) +
  rcartocolor::scale_color_carto_c(palette = "ag_Sunset") +
  scale_size_area(max_size = 20) +
  theme_minimal()

df_size_edu <-
df_size %>%
  mutate(
    ## flag degree mentions per description; dots are escaped — unescaped
    ## they matched any character (e.g. "B5Sc!" counted as a bachelor's)
    bachelors = if_else(str_detect(description, "B\\.Ba\\.|B\\.Sc\\.|BBa|BSc|BBA|BSC|Bachelors"), 1, 0),
    masters = if_else(str_detect(description, "M\\.Ba\\.|M\\.Sc\\.|MBa|MSc|MBA|MSC|Masters"), 1, 0),
    doctorate = if_else(str_detect(description, "Ph\\.D\\.|PhD|Doctorate"), 1, 0)
  ) %>%
  dplyr::select(size, bachelors, masters, doctorate) %>%
  pivot_longer(
    cols = c(bachelors, masters, doctorate),
    names_to = "education",
    values_to = "yes_no"
  ) %>%
  group_by(size) %>%
  mutate(
    total = n(),  # rows per size class (3 rows per job offer after pivoting)
    education = str_to_title(education),
    education = factor(education, levels = c("Doctorate", "Masters", "Bachelors"))
  )

df_size_edu %>%
group_by(education) %>%
  summarize(n = sum(yes_no)) %>%
  ggplot(aes(education, n)) +
  geom_col(
    fill = bl_col,
    ## after_scale() is only valid inside aes(); as a static parameter it
    ## errors, so derive the border color from bl_col directly
    color = colorspace::darken(bl_col, .4, space = "HLS"),
    width = .8,
    size = 1.2
  ) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, 42)
  ) +
  labs(
    x = NULL,
    y = "Number of mentions"
  ) +
  theme(
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(size = 18)
  )

if (isTRUE(save)) {
  ggsave(here::here("plots", "5_1_require_edu_histo.pdf"),
         width = 12, height = 7.5, device = cairo_pdf)
}

## Bachelors: 38 Masters: 10 Doctorate: 1
df_size_edu %>%
  group_by(size, education) %>%
  ## share of rows per size class that mention each degree
  summarize(rel = sum(yes_no) / unique(total)) %>%
  ungroup() %>%
  ggplot(
    aes(
      size, rel,
      fill = education
    )
  ) +
  geom_col(width = .8) +
  scale_y_continuous(
    expand = c(0, 0),
    limits = c(0, .031),
    labels = scales::percent_format()
  ) +
  scale_fill_manual(
    values = cols_degree,
    name = "Required degree:"
  ) +
  guides(fill = guide_legend(reverse = TRUE)) +
  theme(
    legend.text = element_text(size = 14),
    panel.grid.major.x = element_blank()
  ) +
  labs(
    x = "Number of employees",
    y = "Percentage"
  )

if (isTRUE(save)) {
  ggsave(here::here("plots", "5_1_require_edu_size.pdf"),
         width = 12, height = 7.5, device = cairo_pdf)
}

df_rev_edu <-
df_revenue %>%
  mutate(
    ## flag degree mentions; dots are escaped — unescaped they matched any
    ## character, inflating the counts
    bachelors = if_else(str_detect(description, "B\\.Ba\\.|B\\.Sc\\.|BBa|BSc|BBA|BSC|Bachelors"), 1, 0),
    masters = if_else(str_detect(description, "M\\.Ba\\.|M\\.Sc\\.|MBa|MSc|MBA|MSC|Masters"), 1, 0),
    doctorate = if_else(str_detect(description, "Ph\\.D\\.|PhD|Doctorate"), 1, 0)
  ) %>%
  dplyr::select(revenue, bachelors, masters, doctorate) %>%
  pivot_longer(
    cols = c(bachelors, masters, doctorate),
    names_to = "education",
    values_to = "yes_no"
  ) %>%
  group_by(revenue) %>%
  mutate(
    total = n(),  # rows per revenue class (3 rows per job offer)
    education = str_to_title(education),
    education = factor(education, levels = c("Doctorate", "Masters", "Bachelors"))
  ) %>%
  ## share of rows per revenue class mentioning each degree
  group_by(revenue, education) %>%
  summarize(rel = sum(yes_no) / unique(total)) %>%
  ungroup()
## stacked bars per revenue class; x labels dodged into two rows
ggplot(df_rev_edu,
       aes(
         revenue, rel,
         fill = education)
) +
  geom_col(width = .8) +
  scale_x_discrete(guide = guide_axis(n.dodge = 2)) +
  scale_y_continuous(
    expand = c(.0005, .0005),
    limits = c(0, .031),
    labels = scales::percent_format()
  ) +
  scale_fill_manual(
    values = cols_degree,
    name = "Required degree:"
  ) +
  guides(fill = guide_legend(reverse = TRUE)) +
  theme(
    legend.text = element_text(size = 14),
    panel.grid.major.x = element_blank()
  ) +
  labs(
    x = "Estimated revenue",
    y = "Percentage"
  )

if (isTRUE(save)) {
  ggsave(here::here("plots", "5_1_require_edu_revenue_dodge.pdf"),
         width = 12, height = 6.5, device = cairo_pdf)
}

## same plot, smaller x labels instead of dodging
ggplot(df_rev_edu,
       aes(
         revenue, rel,
         fill = education
       )) +
  geom_col(width = .8) +
  scale_y_continuous(
    expand = c(.0005, .0005),
    limits = c(0, .031),
    labels = scales::percent_format()
  ) +
  scale_fill_manual(
    values = cols_degree,
    name = "Required degree:"
  ) +
  guides(fill = guide_legend(reverse = TRUE)) +
  theme(
    legend.text = element_text(size = 14),
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(size = 9)
  ) +
  labs(
    x = "Estimated revenue",
    y = "Percentage"
  )

if (isTRUE(save)) {
  ggsave(here::here("plots", "5_1_require_edu_revenue_small.pdf"),
         width = 12, height = 6.5, device = cairo_pdf)
}
## same plot, angled x labels
ggplot(df_rev_edu, aes(revenue, rel,
                       fill = education)) +
  geom_col(width = .8) +
  scale_y_continuous(
    expand = c(.0005, .0005),
    limits = c(0, .031),
    labels = scales::percent_format()
  ) +
  scale_fill_manual(
    values = cols_degree,
    name = "Required degree:"
  ) +
  guides(fill = guide_legend(reverse = TRUE)) +
  theme(
    legend.text = element_text(size = 14),
    panel.grid.major.x = element_blank(),
    axis.text.x = element_text(angle = 22, hjust = 1, vjust = 1)
  ) +
  labs(
    x = "Estimated revenue",
    y = "Percentage"
  )

## I for now use the programming languages listed by the SO yearly survey:
## JavaScript, HTML/CSS, SQL, Python, Java, Bash/Shell/PowerShell, C#, PHP,
## C++, TypeScript, C, Ruby, Go, Assembly, Swift, Kotlin, R, VBA, Objective-C,
## Scala, Rust, Dart, Elixir, Clojure, WebAssembly + Julia
## count mentions of each programming language (SO survey list + Julia) in
## every job description; \b word boundaries avoid substring hits
df_prog <-
  df_gd %>%
  mutate(
    JavaScript = str_count(description, "\\bJavaScript\\b"),
    ## boundary on both sides — the original " HTML\\b" required a leading
    ## space and missed e.g. "HTML/CSS" at the start of a line or after "("
    HTML = str_count(description, "\\bHTML\\b"),
    CSS = str_count(description, "\\bCSS\\b"),
    SQL = str_count(description, "\\bSQL\\b"),
    Python = str_count(description, "\\bPython\\b"),
    Java = str_count(description, "\\bJava\\b"),
    Bash = str_count(description, "\\bBash\\b"),
    Shell = str_count(description, "\\bShell\\b"),
    ## match both "PowerShell" and "Powershell" (str_count is case-sensitive)
    Powershell = str_count(description, "\\bPower[Ss]hell\\b"),
    ## no trailing \\b: "#" is not a word character, so "C#\\b" only matched
    ## when a word character directly followed the "#"
    `C#` = str_count(description, "\\bC#"),
    PHP = str_count(description, "\\bPHP\\b"),
    `C++` = str_count(description, "\\bC\\+\\+|\\bCPP\\b"),
    TypeScript = str_count(description, "\\bTypeScript\\b"),
    C = str_count(description, "\\bC\\b"),
    Ruby = str_count(description, "\\bRuby\\b"),
    Go = str_count(description, "\\bGo\\b"),
    Assembly = str_count(description, "\\bAssembly\\b"),
    Swift = str_count(description, "\\bSwift\\b"),
    Kotlin = str_count(description, "\\bKotlin\\b"),
    R = str_count(description, "\\bR\\b"),
    VBA = str_count(description, "\\bVBA\\b"),
    ObjectiveC = str_count(description, "\\bObjective-C\\b|\\bObjective\\sC\\b"),
    Scala = str_count(description, "\\bScala\\b"),
    Rust = str_count(description, "\\bRust\\b"),
    Dart = str_count(description, "\\bDart\\b"),
    Elixir = str_count(description, "\\bElixir\\b"),
    Clojure = str_count(description, "\\bClojure\\b"),
    WebAssembly = str_count(description, "\\bWebAssembly\\b"),
    Julia = str_count(description, "\\bJulia\\b")
  ) %>%
  dplyr::select(JavaScript:Julia)

df_prog %>%
pivot_longer(
  cols = JavaScript:Julia,
  names_to = "language",
  values_to = "count"
) %>%
  filter(count > 0) %>%
  ## number of descriptions mentioning the language (not total mentions)
  group_by(language) %>%
  summarize(count = n()) %>%
  ungroup() %>%
  mutate(language = fct_reorder(language, count)) %>%
  ggplot(aes(language, count)) +
  geom_segment(
    aes(
      xend = language,
      yend = 0
    ),
    color = bl_col,
    size = 3
  ) +
  geom_point(
    shape = 21,
    color = bl_col,
    fill = "white",
    size = 10,
    stroke = 1
  ) +
  ## count printed inside the lollipop head
  geom_text(
    aes(label = count),
    family = "Overpass Mono",
    color = bl_col,
    fontface = "bold",
    size = 3.3,
    hjust = .5,
    nudge_y = 0
  ) +
  coord_flip() +
  scale_y_continuous(
    expand = c(.02, .02),
    limits = c(0, 340),
    breaks = seq(0, 300, by = 50)
  ) +
  theme_flip +
  theme(axis.text.y = element_text(size = 14)) +
  labs(
    x = NULL,
    y = "Number of job descriptions mentioning each programming language"
  )

if (isTRUE(save)) {
  ggsave(here::here("plots", "5_2_require_prog.pdf"),
         width = 12, height = 7, device = cairo_pdf)
}

df_prog %>%
mutate(id = row_number()) %>%
  pivot_longer(
    cols = JavaScript:Julia,
    names_to = "language",
    values_to = "count"
  ) %>%
  group_by(id) %>%
  mutate(sum = sum(count)) %>%
  ## keep only offers mentioning more than one language
  filter(
    sum > 1,
    count > 0
  ) %>%
  arrange(id) %>%
  pivot_wider(
    id_cols = id,
    names_from = language,
    values_from = count
  ) %>%
  ungroup() %>%
  dplyr::select(-id) %>%
  ## replace each count of 1 with the language name; funs() is deprecated,
  ## so across() + cur_column() replaces the old
  ## funs(ifelse(. == 1, deparse(substitute(.)), NA)) trick.
  ## NOTE(review): counts > 1 also become NA here, exactly as in the
  ## original — presumably intended; confirm.
  mutate(across(everything(), ~ ifelse(.x == 1, cur_column(), NA))) %>%
  unite("combination", sep = " + ", remove = TRUE, na.rm = TRUE) %>%
  group_by(combination) %>%
  count() %>%
  ungroup() %>%
  filter(
    combination != "",
    str_detect(combination, "\\+"),  # only true combinations
    n > 2
  ) %>%
  mutate(combination = fct_reorder(combination, n)) %>%
  ggplot(aes(combination, n)) +
  geom_segment(
    aes(
      xend = combination,
      yend = 0
    ),
    color = bl_col,
    size = 2
  ) +
  geom_point(
    shape = 21,
    color = bl_col,
    fill = "white",
    size = 8,
    stroke = 1
  ) +
  geom_text(
    aes(label = n),
    family = "Overpass Mono",
    color = bl_col,
    fontface = "bold",
    size = 3.5,
    hjust = .5,
    nudge_y = 0
  ) +
  coord_flip() +
  scale_y_continuous(
    expand = c(.02, .02),
    breaks = seq(0, 80, by = 10),
    limits = c(0, 90)
  ) +
  theme_flip +
  labs(
    x = NULL,
    y = "Mentioned combinations of programming\nlanguages in job descriptions",
    caption = 'Note: Only combinations with a frequency of 3 or more shown.'
  )

## CED: List still needed (incl. tools like Power BI, Tableau etc)
## –> Simply filter by “5+ years”? Or search for mentions of years and extract
## the number? And how do we handle cases where different wordings express the
## same requirement?
## parse the "$X-$Y" salary range into numeric low/high bounds (in $K)
df_salary <-
  df_gd %>%
  dplyr::select(salary, employer, ratings) %>%
  filter(!is.na(salary)) %>%
  mutate(
    salary_low = str_extract(salary, "\\$.*\\-"),
    salary_high = str_extract(salary, "\\-\\$.*")
  ) %>%
  mutate_at(vars(matches("salary_")), ~str_extract(., "[0-9]+")) %>%
  mutate_at(vars(matches("salary_")), as.numeric) %>%
  mutate(
    ## parentheses required: the original `salary_low + salary_high / 2`
    ## added the full low bound to half the high bound instead of averaging
    salary_avg = (salary_low + salary_high) / 2,
    salary_class = salary_avg %/% 10 * 10  # bin into $10K classes
  )

df_salary %>%
ggplot(aes(x = salary_avg)) +
  geom_histogram(
    fill = bl_col,
    color = colorspace::darken(bl_col, .4),
    bins = 50
  ) +
  ## dashed reference line at the mean of the average salaries
  geom_vline(
    aes(xintercept = mean(salary_avg)),
    linetype = "dashed",
    color = "grey35",
    size = 1
  ) +
  annotate(
    "text",
    x = 90,
    y = 47,
    label = "Mean",
    family = "Montserrat",
    color = "grey35",
    fontface = "bold",
    size = 4.5
  ) +
  scale_x_continuous(
    limits = c(1, 300),
    breaks = c(1, seq(50, 300, by = 50)),
    labels = glue::glue("${c(1, seq(50, 300, by = 50))}K")
  ) +
  scale_y_continuous(
    expand = c(0, 0)
  ) +
  scale_fill_manual(
    values = c("grey60", bl_col),
    guide = "none"
  ) +
  labs(
    x = "Average salary",
    y = "Number of job offers"
  ) +
  theme(panel.grid.major.x = element_blank())

df_salary %>%
filter(!is.na(ratings)) %>%
  ggplot(aes(ratings, salary_avg)) +
  ## smoothed trend (geom_smooth default method; switch to "lm" for linear)
  geom_smooth(
    color = colorspace::darken(bl_col, .2),
    se = FALSE
  ) +
  ## hollow outline plus translucent filled point per observation
  geom_point(
    shape = 1,
    size = 4,
    color = bl_col,
    stroke = .2
  ) +
  geom_point(
    size = 4,
    alpha = .1,
    color = bl_col
  ) +
  scale_y_continuous(
    breaks = seq(50, 300, by = 50),
    labels = glue::glue("${seq(50, 300, by = 50)}K"),
    limits = c(50, 300)
  ) +
  labs(
    x = "Company rating",
    y = "Average salary (calculated from range)"
  )

if (isTRUE(save)) {
  ggsave(here::here("plots", "6_5_rating_salary.pdf"),
         width = 12, height = 7.5, device = cairo_pdf)
}

## Converting page 1 to 1_1_jobs_cat_1.png... done!
## Converting page 1 to 1_1_jobs_tech_adj_1.png... done!
## Converting page 1 to 1_1_jobs_word_1.png... done!
## Converting page 1 to 2_1_map_northamerica_cities_1.png... done!
## Converting page 1 to 2_1_map_states_cities_1.png... done!
## Converting page 1 to 2_2_map_states_chloro_1.png... done!
## Converting page 1 to 2_2_map_states_chloro2_1.png... done!
## Converting page 1 to 2_2_map_states_hex_1.png... done!
## Converting page 1 to 2_3_map_counties_chloro_1.png... done!
## Converting page 1 to 3_1_size_histo_1.png... done!
## Converting page 1 to 3_2_revenue_histo_1.png... done!
## Converting page 1 to 3_2_revenue_words_horizontal_1.png... done!
## Converting page 1 to 3_2_revenue_words_vertical_1.png... done!
## Converting page 1 to 3_3_industry_counts_1.png... done!
## Converting page 1 to 3_3_sector_counts_1.png... done!
## Converting page 1 to 3_4_rating_histo_1.png... done!
## Converting page 1 to 3_4_rating_lolli_1.png... done!
## Converting page 1 to 4_cloud_sequ_1.png... done!
## Converting page 1 to 4_cloud_word_1.png... done!
## Converting page 1 to 5_1_require_edu_histo_1.png... done!
## Converting page 1 to 5_1_require_edu_revenue_angle_1.png... done!
## Converting page 1 to 5_1_require_edu_revenue_dodge_1.png... done!
## Converting page 1 to 5_1_require_edu_revenue_small_1.png... done!
## Converting page 1 to 5_1_require_edu_size_1.png... done!
## Converting page 1 to 5_2_require_prog_1.png... done!
## Converting page 1 to 5_2_require_prog_comb_1.png... done!
## Converting page 1 to 6_1_salary_histo_1.png... done!
## Converting page 1 to 6_5_rating_salary_1.png... done!